Expression analysis pipeline (unsupervised)

hZhu & xLi

2023-03-08

This vignette documents the unsupervised analysis pipeline for RNA expression data.

Prepare clean sample info and expression matrix

Load data and prepare clean RNA sample info and protein expressions with specified parameters.

Code
expressions_file <- gzfile(system.file("extdata/expressions.csv.gz",
  package = "expr"), "rt")
RNA_sample_info_file <- system.file("extdata/RNA_sample_info.csv",
  package = "expr")
RNA_clinic_info_file <- system.file("extdata/RNA_clinic_info.csv",
  package = "expr")

RNA_sample_info <- read.csv(RNA_sample_info_file, header = T,
  stringsAsFactors = F, check.names = F)
RNA_clinic_info <- read.csv(RNA_clinic_info_file, header = T,
  stringsAsFactors = F, check.names = F)
RNA_sample_info <- table_org(list(RNA_sample_info, RNA_clinic_info))
expressions <- read.csv(expressions_file, header = T, stringsAsFactors = F,
  check.names = F)
load(system.file("extdata/protein_coding_ensemble2symbol.RData",
  package = "expr"))
load(system.file("extdata/protein_coding_gene_info.RData", package = "expr"))

# set param
paramInput <- data.frame(sample_id_col = "Sample_ID", gene_id_col = "gene_id",
  mark_library_size = T, adjusted_by_library_size = T, tolerant_library_size_factor = 1.1,
  ens_id_col = "Gene_ID", symbol_col = "Gene_Symbol", mark_purity = T,
  remove_low_purity_sample = T, low_purity_threshold = 0.3,
  mark_lowexpressgene_pct = T, lowexpression_threshold = 1,
  remove_sample_with_intense_lowexpressgene = T, outlier_lowexpressgene_pct_factor = 1.5,
  remove_lowexpressgene = T, sample_frequency_threshold = 0.5)

# clean
tmp <- expression(prepare_clean_RNA_sample_info_and_protein_expressions(RNA_sample_info = RNA_sample_info,
  sample_id_col = paramInput[["sample_id_col"]], expressions = expressions,
  gene_id_col = paramInput[["gene_id_col"]], mark_library_size = paramInput[["mark_library_size"]],
  adjusted_by_library_size = paramInput[["adjusted_by_library_size"]],
  tolerant_library_size_factor = paramInput[["tolerant_library_size_factor"]],
  protein_coding_ensemble2symbol_table = protein_coding_ensemble2symbol,
  ens_id_col = paramInput[["ens_id_col"]], symbol_col = paramInput[["symbol_col"]],
  mark_purity = paramInput[["mark_purity"]], remove_low_purity_sample = paramInput[["remove_low_purity_sample"]],
  low_purity_threshold = paramInput[["low_purity_threshold"]],
  mark_lowexpressgene_pct = paramInput[["mark_lowexpressgene_pct"]],
  lowexpression_threshold = paramInput[["lowexpression_threshold"]],
  remove_sample_with_intense_lowexpressgene = paramInput[["remove_sample_with_intense_lowexpressgene"]],
  outlier_lowexpressgene_pct_factor = paramInput[["outlier_lowexpressgene_pct_factor"]],
  remove_lowexpressgene = paramInput[["remove_lowexpressgene"]],
  sample_frequency_threshold = paramInput[["sample_frequency_threshold"]]))

consoleOutput <- capture.output(results <- eval(tmp), type = c("output"))

# data loading summary
SumInfo <- t(data.frame(paramInput[, grep("factor|threshold",
  colnames(paramInput))], Number_of_Cleaned_Samples = dim(results$clean_RNA_sample_info)[1],
  Number_of_Cleaned_Genes = dim(results$clean_protein_expressions)[1],
  Number_of_Deleted_Samples = dim(results$marked_ori_RNA_sample_info)[1] -
    dim(results$clean_RNA_sample_info)[1], Notes = grep("too many low expression genes",
    consoleOutput, value = T)))
t1 <- knitr::kable(SumInfo, caption = "Data loading summary") %>%
  kableExtra::kable_styling(font_size = "200%")


# cleaned RNA sample info
t2 <- format(results$clean_RNA_sample_info, nsmall = 2) %>%
  DT::datatable(options = list(scrollX = T, scrollY = T), caption = htmltools::tags$caption(style = "caption-side: top; text-align: left; color:black; font-size:200% ;",
    "Sample Info (cleaned)"))

# log2 transformation
clean_RNA_sample_info <- results[["clean_RNA_sample_info"]]
clean_protein_expressions <- results[["clean_protein_expressions"]]  # it is not log2 transformed yet
log2_clean_protein_expressions <- log2(clean_protein_expressions +
  1)

# define class of sample attributes
coerce_ind <- names(which(sapply(clean_RNA_sample_info[grep("ID|MRN",
  colnames(clean_RNA_sample_info))], is.numeric)))
clean_RNA_sample_info[, coerce_ind] <- as.character(clean_RNA_sample_info[,
  coerce_ind])

Show data loading summary.

Data loading summary
tolerant_library_size_factor 1.1
low_purity_threshold 0.3
lowexpression_threshold 1
outlier_lowexpressgene_pct_factor 1.5
sample_frequency_threshold 0.5
Number_of_Cleaned_Samples 59
Number_of_Cleaned_Genes 13756
Number_of_Deleted_Samples 1
Notes Sample Cap2115-6-ID04 has (have) too many low expression genes: 84.6735024284943 % respectively.

Show cleaned RNA sample info.

Batch effects

Visualization of sample distribution via unsupervised clustering

In this step, genes with higher variance across data set and a fill rate greater than 0.75 are used for unsupervised analysis.

Code
rm(list = setdiff(ls(), c(lsf.str(), grep("clean|protein_coding_gene_info",
  ls(), value = T))))

# set param
paramInput <- data.frame(method = "MAD", mad_top_n = 1000, remove_outlier = F)

# prep data
unsupervised_data <- prepare_unsupervised_data(expressions = log2_clean_protein_expressions,
  method = paramInput[["method"]], mad_top_n = paramInput[["mad_top_n"]],
  remove_outlier = paramInput[["remove_outlier"]])
assertthat::are_equal(colnames(unsupervised_data), clean_RNA_sample_info[["Sample_ID"]])
#> [1] TRUE

# run analysis
unsupervised_results <- unsupervised_analysis(unsupervised_data,
  labels = clean_RNA_sample_info$PROTOCOL, cols = c(`2014-0938` = "green",
    `2016-0861` = "blue", `LAB08-0380` = "red"))

# generate plots
p1 <- unsupervised_results$plots$pca$`3D` %>%
  layout(title = "PCA", autosize = F)

p2 <- ggplotly(unsupervised_results$plots$umap + theme_bw() +
  theme(legend.position = "none") + geom_point(aes(text = sprintf("sample: %s",
  colnames(unsupervised_data)))), tooltip = c("x", "y", "Label",
  "text")) %>%
  layout(title = "UMAP")

p3 <- ggplotly(unsupervised_results$plots$tsne + theme_bw() +
  theme(legend.position = "none") + geom_point(aes(text = sprintf("sample: %s",
  colnames(unsupervised_data)))), tooltip = c("x", "y", "Label",
  "text")) %>%
  layout(title = "tSNE")

p4 <- ggplotly(unsupervised_results$plots$mds + theme_bw() +
  theme(legend.position = "none") + geom_point(aes(text = sprintf("sample: %s",
  colnames(unsupervised_data)))), tooltip = c("x", "y", "Label",
  "text")) %>%
  layout(title = "MDS")

p5 <- unsupervised_results$plots$heatmap
Code
a <- data.frame(x = c(0.2, 0.8, 0.2, 0.8), y = c(1, 1, 0.45,
  0.45), text = c("PCA", "UMAP", "tSNE", "MDS"), xref = "paper",
  yref = "paper", xanchor = "center", yanchor = "bottom", showarrow = FALSE)
annotations <- apply(a, 1, function(x) lapply(as.list(x), type.convert,
  as.is = TRUE))

# grid.draw(cowplot::get_legend(unsupervised_results$plots$umap+theme(legend.position
# = 'top')))
p <- subplot(list(style(p1, showlegend = F), p2, p3, p4), nrows = 2,
  margin = 0.1) %>%
  layout(title = "Unsupervised clustering before correction",
    annotations = annotations, scene = list(domain = list(x = c(0,
      0.5), y = c(0.5, 1))))

browsable(tagList(tags$div(style = "width:100%;height=12;display:block;float:left;",
  p)))
Code
cat("<br>")


Code
p5

Determine if batch effect exist

Call Batch effect “exist”, if k-means clustering match with batch definition, and that clusters are clearly separated ( evaluated by Silhouette score and withinss/betweenss). If batch_effect_exist==TRUE, proceed to correction; Otherwise, results in next section stays blank.

Code
kmean.df <- (unsupervised_results$ress$umap$layout)
# calculate Silhouette score for k from 2 to 2 times number
# of batches and determine the optimal k for clustering
avg_sil <- function(k) {
  km.res <- kmeans(kmean.df, centers = k, nstart = 25)
  ss <- cluster::silhouette(km.res$cluster, dist(kmean.df))
  mean(ss[, 3])
}
k.values <- 2:(2 * (length(unique(clean_RNA_sample_info$PROTOCOL))))
avg_sil_values <- sapply(k.values, avg_sil)
km.res <- kmeans(kmean.df, centers = k.values[which.max(avg_sil_values)],
  nstart = 25)

# test if good clustering
good <- all(c(km.res$betweenss/km.res$totss > 0.5, max(avg_sil_values) >
  0.1))

# test if k-means clustering match with batch definition
clusters <- lapply(unique(km.res$cluster), function(x) names(km.res$cluster)[km.res$cluster ==
  x])
batches <- lapply(unique(clean_RNA_sample_info$PROTOCOL), function(x) clean_RNA_sample_info$Sample_ID[clean_RNA_sample_info$PROTOCOL ==
  x])
hits <- c()
i <- 0
for (ncluster in 1:length(clusters)) {
  for (nbatches in 1:length(batches)) {
    hit <- length(setdiff(clusters[[ncluster]], batches[[nbatches]])) ==
      0 & length(setdiff(batches[[nbatches]], clusters[[ncluster]])) ==
      0
    hits <- c(hits, hit)
  }
}
matched <- any(hits) == T

batch_effect_exist <- all(matched, good)
cat(sprintf("Clustering good?: **%s**<br>
            Clustering matched?: **%s**<br>
            Batch effect exists: **%s**<br>",
  good, matched, batch_effect_exist))

Clustering good?: TRUE
Clustering matched?: TRUE
Batch effect exists: TRUE

Batch effect correction

If batch effect exist, attempt to use limma and COMBAT to correct such effect

Code
unsupervised_resultss <- list()
if (batch_effect_exist) {
  # limma
  limma_rbe_log2_protein_expressions <- limma::removeBatchEffect(log2_clean_protein_expressions,
    batch = clean_RNA_sample_info$PROTOCOL)
  uns_limma_rbe_log2_protein_expressions <- prepare_unsupervised_data(limma_rbe_log2_protein_expressions,
    method = "MAD", mad_top_n = 1000)
  unsupervised_resultss[["limma"]] <- unsupervised_analysis(uns_limma_rbe_log2_protein_expressions,
    labels = clean_RNA_sample_info$PROTOCOL, cols = c(`2014-0938` = "green",
      `2016-0861` = "blue", `LAB08-0380` = "red"))
  # combat
  combat_rbe_log2_protein_expressions <- sva::ComBat(log2_clean_protein_expressions,
    batch = clean_RNA_sample_info$PROTOCOL)
  uns_combat_rbe_log2_protein_expressions <- prepare_unsupervised_data(combat_rbe_log2_protein_expressions,
    method = "MAD", mad_top_n = 1000)
  unsupervised_resultss[["combat"]] <- unsupervised_analysis(uns_combat_rbe_log2_protein_expressions,
    labels = clean_RNA_sample_info$PROTOCOL, cols = c(`2014-0938` = "green",
      `2016-0861` = "blue", `LAB08-0380` = "red"))


  # generate plots
  p1 <- p2 <- p3 <- p4 <- p5 <- list()
  for (method in c("limma", "combat")) {
    unsupervised_results <- unsupervised_resultss[[method]]
    p1[[method]] <- unsupervised_results$plots$pca$`3D` %>%
      layout(title = sprintf("PCA %s", method), autosize = F)

    p2[[method]] <- ggplotly(unsupervised_results$plots$umap +
      theme_bw() + theme(legend.position = "none") + geom_point(aes(text = sprintf("sample: %s",
      colnames(unsupervised_data)))), tooltip = c("x",
      "y", "Label", "text")) %>%
      layout(title = sprintf("UMAP %s", method))

    p3[[method]] <- ggplotly(unsupervised_results$plots$tsne +
      theme_bw() + theme(legend.position = "none") + geom_point(aes(text = sprintf("sample: %s",
      colnames(unsupervised_data)))), tooltip = c("x",
      "y", "Label", "text")) %>%
      layout(title = sprintf("tSNE %s", method))

    p4[[method]] <- ggplotly(unsupervised_results$plots$mds +
      theme_bw() + theme(legend.position = "none") + geom_point(aes(text = sprintf("sample: %s",
      colnames(unsupervised_data)))), tooltip = c("x",
      "y", "Label", "text")) %>%
      layout(title = sprintf("MDS %s", method))

    p5[[method]] <- unsupervised_results$plots$heatmap
  }
}

Correction result

limma result
Code
if (batch_effect_exist) {
  a <- data.frame(x = c(0.2, 0.8, 0.2, 0.8), y = c(1, 1, 0.45,
    0.45), text = c("PCA", "UMAP", "tSNE", "MDS"), xref = "paper",
    yref = "paper", xanchor = "center", yanchor = "bottom",
    showarrow = FALSE)
  annotations <- apply(a, 1, function(x) lapply(as.list(x),
    type.convert, as.is = TRUE))

  pp1 <- p1[["limma"]]
  pp2 <- p2[["limma"]]
  pp3 <- p3[["limma"]]
  pp4 <- p4[["limma"]]
  p <- subplot(list(style(pp1, showlegend = F), pp2, pp3, pp4),
    nrows = 2, margin = 0.1) %>%
    layout(title = "Unsupervised clustering after correction",
      annotations = annotations, scene = list(domain = list(x = c(0,
        0.5), y = c(0.5, 1))))

  browsable(tagList(tags$div(style = "width:100%;height=12;display:block;float:left;",
    p)))
}
Code
if (batch_effect_exist) {
  cat("<br>")
  p5[["limma"]]
}


COMBAT result
Code
if (batch_effect_exist) {
  a <- data.frame(x = c(0.2, 0.8, 0.2, 0.8), y = c(1, 1, 0.45,
    0.45), text = c("PCA", "UMAP", "tSNE", "MDS"), xref = "paper",
    yref = "paper", xanchor = "center", yanchor = "bottom",
    showarrow = FALSE)
  annotations <- apply(a, 1, function(x) lapply(as.list(x),
    type.convert, as.is = TRUE))

  pp1 <- p1[["combat"]]
  pp2 <- p2[["combat"]]
  pp3 <- p3[["combat"]]
  pp4 <- p4[["combat"]]
  p <- subplot(list(style(pp1, showlegend = F), pp2, pp3, pp4),
    nrows = 2, margin = 0.1) %>%
    layout(title = "Unsupervised clustering after correction",
      annotations = annotations, scene = list(domain = list(x = c(0,
        0.5), y = c(0.5, 1))))

  browsable(tagList(tags$div(style = "width:100%;height=12;display:block;float:left;",
    p)))
}
Code
if (batch_effect_exist) {
  cat("<br>")
  p5[["combat"]]
}
#> <br>

Downstream analysis

If batch effect correction was done, use post-batch-effect-removal expression matrix for downstream analysis, eg. unsupervised clustering. Below, limma-corrected expressions were used.

unsupervised

Code
if (batch_effect_exist) {
  log2_expressions <- limma_rbe_log2_protein_expressions
} else {
  log2_expressions <- log2_clean_protein_expressions
}

# clear environment
rm(list = setdiff(ls(), c(grep("log2_expressions|info", ls(),
  value = T))))
# save(clean_RNA_sample_info,log2_expressions,file =
# 'data/woodman.RData')
unsupervised_expressions <- prepare_unsupervised_data(log2_expressions,
  method = "MAD", mad_top_n = 1000)

heatmap_mRNA <- draw(Heatmap(t(scale(t(unsupervised_expressions))),
  name = "expression\nz_score", top_annotation = getHeatMapAnnotation(sampleAttr = clean_RNA_sample_info,
    data_mx = log2_expressions, essentialOnly = T), show_row_names = F,
  column_dend_reorder = T, show_column_names = F, column_names_gp = gpar(fontsize = 1)))

Differential gene expression (DGE) analysis

comparison between Baseline group and TP2 group

Code
comp_limma <- dge_limma(log2_expressions, is_rawcount = F, is_logged = T,
  normalize = F, sample_frequency_threshold = 0.5, clinic_info = clean_RNA_sample_info,
  ID_col = "Sample_ID", group_col = "APOLLO_TIMPOINT", contrasts = c("TP2-Baseline"),
  method = "limma_trend")
#> non raw_count data will be analyzed with limma
stats <- cbind(comp_limma$statistics, protein_coding_gene_info[match(rownames(comp_limma$statistics),
  protein_coding_gene_info$Gene_name), -1])
stats <- stats[order(stats[["TP2-Baseline:P.Value"]]), ]

format(stats[stats[["TP2-Baseline:P.Value"]] < 0.05, ], nsmall = 2) %>%
  DT::datatable(options = list(scrollX = T, scrollY = T), caption = htmltools::tags$caption(style = "caption-side: top; text-align: left; color:black; font-size:200% ;",
    "DGE: baseline VS TP2"))

Paired comparison between Baseline and TP2 samples

Code

paired_RNA_sample_info <- clean_RNA_sample_info[table(clean_RNA_sample_info$MRN) >
  1, ]
# paired_RNA_sample_info<-clean_RNA_clinic[match(paired_RNA_sample_info$Sample_ID,clean_RNA_clinic$Sample_ID),]
paired_log2_expressions <- log2_expressions[, paired_RNA_sample_info$Sample_ID]
assertthat::are_equal(colnames(paired_log2_expressions), paired_RNA_sample_info$Sample_ID)
#> [1] TRUE
paired_comp_limma <- dge_limma(paired_log2_expressions, is_rawcount = F,
  is_logged = T, normalize = F, sample_frequency_threshold = 0.5,
  clinic_info = paired_RNA_sample_info, ID_col = "Sample_ID",
  group_col = "APOLLO_TIMPOINT", contrasts = c("TP2-Baseline"),
  method = "limma_trend")
#> non raw_count data will be analyzed with limma
stats <- cbind(paired_comp_limma$statistics, protein_coding_gene_info[match(rownames(paired_comp_limma$statistics),
  protein_coding_gene_info$Gene_name), -1])
stats <- stats[order(stats[["TP2-Baseline:P.Value"]]), ]

format(stats[stats[["TP2-Baseline:P.Value"]] < 0.05, ], nsmall = 2) %>%
  DT::datatable(options = list(scrollX = T, scrollY = T), caption = htmltools::tags$caption(style = "caption-side: top; text-align: left; color:black; font-size:200% ;",
    "DGE: paired baseline VS TP2"))